Analysis based primarily on the Sun and Vera et. al. 2020 paper with the code from https://github.com/BROOKELAB/SingleCell/tree/fa170e4c16f64670bada6ef56ae9cf1ffafb4786 and the Seurat database vignettes https://github.com/satijalab/seurat/blob/master/vignettes/pbmc3k_tutorial.Rmd
Preliminary analysis with regressing out the cell cycle markers that skew the clustering
# load required libraries
library(Seurat)
library(ggplot2)
library(sctransform)
library(dplyr)
Read in the .h5 file using Seurat function. Load in the count matrix as a Seurat object.
# read in Cal07 filtered data
cal07_a549_counts <- Read10X_h5(
"/Users/ethayer/Google Drive/Grad School/Brooke Lab/Data/polyIC_scRNAseq/03_scRNAseq_h5/Cal07_aggr/filtered_feature_bc_matrix.h5",
use.names = TRUE, unique.features = TRUE)
# load the data into Seurat for analysis
a549 <- CreateSeuratObject(counts = cal07_a549_counts, min.cells = 4,
min.features = 400, project = "cal07_mock")
Organize the data into two levels based off of the barcode ids (in this case Cal07(1) or Mock(2)).
# split the rownames by -, then select only the second element
# (the number 1 (cal07) or 2 (mock))
experiment_id <- factor(sapply(strsplit(rownames(a549@meta.data), "-"),
function(x) x[2]))
# set the factor levels
levels(experiment_id) <- c("Cal07", "Mock")
# Add it back to the seurat object meta-data
a549@meta.data$experiment_id <- experiment_id
Look at QC metrics of the data.
# Show QC metrics for the first/last 5 cells
head(a549@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock 43266 5875 Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock 3008 538 Cal07
## AAACCCACACGACTAT-1 cal07_mock 46286 5888 Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock 40966 5854 Cal07
## AAACGAAAGTCTAACC-1 cal07_mock 35460 5214 Cal07
summary(a549@meta.data$experiment_id)
## Cal07 Mock
## 5019 7243
# Visualize QC metrics as a violin plot
VlnPlot(a549, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
# Look at the relationship between counts and features
FeatureScatter(a549, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
group.by = "experiment_id")
Filtering based on QC. Low feature could mean this is a droplet. High count/feature indicates a doublet.
# Subset to a new object based off of the QC data
a549_01 <- subset(a549,
subset = nFeature_RNA > 500 & nFeature_RNA < 8500 & nCount_RNA < 115000)
# Do all the QC stuff again with the new object
head(a549_01@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock 43266 5875 Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock 3008 538 Cal07
## AAACCCACACGACTAT-1 cal07_mock 46286 5888 Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock 40966 5854 Cal07
## AAACGAAAGTCTAACC-1 cal07_mock 35460 5214 Cal07
# Visualize QC metrics as a violin plot
VlnPlot(a549_01, features = c("nFeature_RNA", "nCount_RNA"), ncol = 2)
# Look at the relationship between counts and features
FeatureScatter(a549_01, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
group.by = "experiment_id")
Loading in the cell cycle markers from Tirosh et. al., 2015 with Seurat. We can then assign cell cycle scores.
# Loading in the markers
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
# Assign cell cycle scores based off of the imported cycle markers above
a549_01 <- CellCycleScoring(
a549_01,
s.features = s.genes,
g2m.features = g2m.genes,
set.ident = TRUE)
# view cell cycle scores and phase assignments
head(a549_01[[]])
## orig.ident nCount_RNA nFeature_RNA experiment_id
## AAACCCAAGGTAGACC-1 cal07_mock 43266 5875 Cal07
## AAACCCAAGGTAGCAC-1 cal07_mock 3008 538 Cal07
## AAACCCACACGACTAT-1 cal07_mock 46286 5888 Cal07
## AAACCCAGTAGCTCGC-1 cal07_mock 40966 5854 Cal07
## AAACGAAAGTCTAACC-1 cal07_mock 35460 5214 Cal07
## AAACGAAGTGTAAATG-1 cal07_mock 33306 5056 Cal07
## S.Score G2M.Score Phase old.ident
## AAACCCAAGGTAGACC-1 -0.386699507 -6.9018137 G1 cal07_mock
## AAACCCAAGGTAGCAC-1 -0.007799672 -0.5129248 G1 cal07_mock
## AAACCCACACGACTAT-1 -1.211617406 -7.4227568 G1 cal07_mock
## AAACCCAGTAGCTCGC-1 -1.257594417 -6.7613211 G1 cal07_mock
## AAACGAAAGTCTAACC-1 -0.964901478 -6.0331424 G1 cal07_mock
## AAACGAAGTGTAAATG-1 -0.397783251 -5.4420962 G1 cal07_mock
# different scaling method which allows you to specify variables contributing to the scaling/normalization of the data
a549_01 <- SCTransform(
a549_01, method = "glmGamPoi",
vars.to.regress = c("S.Score", "G2M.Score"),
verbose = FALSE)
Looking at which dimensions contribute the most to the percentage of variance
# PCA to find similar genes/cells for clustering
a549_01 <- RunPCA(a549_01, verbose = FALSE)
# Best explanation from https://github.com/satijalab/seurat/blob/master/vignettes/pbmc3k_tutorial.Rmd:
# "‘Elbow plot’: a ranking of principle components based on the percentage of variance explained by each one"
# I chose 15, as there is little drop off after that and it is best to be conservative when picking your dimensions to not leave anything out
ElbowPlot(a549_01)
Performing dimensional reduction and clustering
# making UMAP dimensional reduction
a549_01 <- RunUMAP(a549_01, dims = 1:15, verbose = FALSE)
a549_01 <- FindNeighbors(a549_01, dims = 1:15, verbose = FALSE)
a549_01 <- FindClusters(a549_01, verbose = FALSE, resolution = 0.5)
DimPlot(a549_01, label = TRUE) + NoLegend()
Want to make sure the cell cycle genes are regressed out
# loop to plot each UMAP with colored cell cycle genes one by one
for (x in g2m.genes[g2m.genes %in% VariableFeatures(a549)]) {
g <- FeaturePlot(a549_01, x, order = TRUE, pt.size = 1)
print(g)
}
# UMAP color/clustering based on classification of cell cycle
# making sure the cell cycle genes are widely distributed
FeaturePlot(a549_01, "G2M.Score")
Identifying the most variable genes
# identify features that are outliers on a 'mean variability plot'
a549_01 <- FindVariableFeatures(a549_01, selection.method = "vst", nfeatures = 2000)
# Identify the 25 most highly variable genes
top25 <- head(VariableFeatures(a549_01), 15)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(a549_01)
plot2 <- LabelPoints(plot = plot1, points = top25, repel = TRUE,
max.overlaps = 20, xnudge = 0, ynudge = 0)
plot2
Where in the clustering are the viral genes being expressed
# where are the infected cells in the dimensional reduction
segment_gene_names <- c("NS", "NA.", "HA", "NP", "PB2", "PB1", "PA", "M")
# loop to plot each UMAP with colored flu genes one by one
for (x in segment_gene_names) {
g <- FeaturePlot(a549_01, x, order = TRUE, pt.size = 1)
print(g)
}